#==============================================================================#
# 1-extract-profiles-bd.R
# Matt Curtis 27/06/22
# mjdcurtis@gmail.com
#------------------------------------------------------------------------------#
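# Converts the raw Geni profiles CSV into a Parquet dataset, written in chunks
# of at most 10 million rows per file, using an explicit column schema.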
#==============================================================================#
library(arrow)
library(tidyverse)
library(data.table)
library(tidytable)

#------------------------------------------------------------------------------#
# Start from an empty workspace: remove every object visible from here and
# trigger a garbage collection.
# NOTE(review): rm(list = ls()) does not detach packages or reset options, and
# it wipes whatever the caller had in this environment; running the script in a
# fresh R session is the more reliable way to get a clean state.
rm(list = ls())
gc()

# Set the working directory once; the relative paths below are resolved
# against it.
# NOTE(review): this hard-codes a location under the user's home directory, so
# the package must live at "~/Replication package/" for the script to run.
setwd("~/Replication package/")
# Input: the raw Geni profiles CSV.
csv_file <- "./1_raw_data/1_2_geni/geni_profiles_patched.csv"
# Output: directory that receives the Parquet files written further down.
dest <- "./1_raw_data/1_2_geni/1_2_geni_chunked/geni_profiles" 


# A single string of 71 "c" characters (same count as the fields in the schema
# defined below).
# NOTE(review): `types` is not referenced again in this script; it looks like a
# leftover readr-style column-type spec. Confirm before removing.
types <- strrep("c", 71L)

# Explicit Arrow schema for the Geni profiles CSV: 71 fields, listed in the
# order they are declared to the CSV reader. Free-text and flag-like columns
# are strings; day / month / year components are 64-bit floats.
sch <- schema(
  # --- identity and names ---------------------------------------------------
  profile_id   = string(),
  first_name   = string(),
  middle_name  = string(),
  last_name    = string(),
  maiden_name  = string(),
  suffix       = string(),
  display_name = string(),
  gender       = string(),
  occupation   = string(),

  # --- birth: date range and place ------------------------------------------
  birth_range       = string(),
  birth_start_day   = float64(),
  birth_start_month = float64(),
  birth_start_year  = float64(),
  birth_start_circa = string(),
  birth_end_day     = float64(),
  birth_end_month   = float64(),
  birth_end_year    = float64(),
  birth_end_circa   = string(),
  birth_city        = string(),
  birth_state       = string(),
  birth_county      = string(),
  birth_country     = string(),
  birth_place_name  = string(),

  # --- baptism: date range and place ----------------------------------------
  baptism_range       = string(),
  baptism_start_day   = float64(),
  baptism_start_month = float64(),
  baptism_start_year  = float64(),
  baptism_start_circa = string(),
  baptism_end_day     = float64(),
  baptism_end_month   = float64(),
  baptism_end_year    = float64(),
  baptism_end_circa   = string(),
  baptism_city        = string(),
  baptism_state       = string(),
  baptism_county      = string(),
  baptism_country     = string(),
  baptism_place_name  = string(),

  # --- death: date range, cause and place -----------------------------------
  death_range       = string(),
  death_start_day   = float64(),
  death_start_month = float64(),
  death_start_year  = float64(),
  death_start_circa = string(),
  death_end_day     = float64(),
  death_end_month   = float64(),
  death_end_year    = float64(),
  death_end_circa   = string(),
  cause_of_death    = string(),
  death_city        = string(),
  death_state       = string(),
  death_county      = string(),
  death_country     = string(),
  death_place_name  = string(),

  # --- burial: date range and place -----------------------------------------
  burial_range       = string(),
  burial_start_day   = float64(),
  burial_start_month = float64(),
  burial_start_year  = float64(),
  burial_start_circa = string(),
  burial_end_day     = float64(),
  burial_end_month   = float64(),
  burial_end_year    = float64(),
  burial_end_circa   = string(),
  burial_city        = string(),
  burial_state       = string(),
  burial_county      = string(),
  burial_country     = string(),
  burial_place_name  = string(),

  # --- last residence -------------------------------------------------------
  last_residence_city       = string(),
  last_residence_state      = string(),
  last_residence_county     = string(),
  last_residence_country    = string(),
  last_residence_place_name = string()
)

# Open the CSV as an Arrow dataset. Column names and types come from `sch`,
# and the first line of the file is skipped (presumably the header row, which
# the explicit schema replaces).
csv_stream <- open_csv_dataset(
  csv_file,
  schema = sch,
  skip = 1
)

# Write the dataset to `dest` as Parquet, capped at 10 million rows per file.
# Existing files in `dest` with the same names are overwritten.
# NOTE(review): no `partitioning` is passed, so `hive_style = TRUE` presumably
# has no visible effect on the output layout; confirm.
write_dataset(
  dataset = csv_stream,
  path = dest,
  format = "parquet",
  max_rows_per_file = 10000000L,
  hive_style = TRUE,
  existing_data_behavior = "overwrite"
)